# download_jola_issue.py
# JoLA (Journal of Learning Analytics) Downloader
# -------------------------------------------------
# Automates downloading PDFs from JoLA OJS 3.x issue pages
# - Parses article titles and galley links from the issue page
# - Resolves actual download links from "view" pages (avoiding inline view-only links)
# - Handles both absolute and relative URLs with urljoin for reliability
# - Saves each PDF with sanitized filenames for cross-platform compatibility
# - Creates dynamic folder names like JLA_Vol7_Issue1_2020 from issue metadata
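# - Logs every download attempt (saved, failed, or skipped for lack of a PDF) to CSV;
#   titles skipped via --skip-headings are only printed to the console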
# - Reusable for other OJS 3.x journals with similar galley structures
# Journal of Learning Analytics (JoLA) — Issue Downloader (LIVE URL)
# download_jla_issue_live.py
# -------------------------------------------------
# Input: an issue URL (e.g., https://learning-analytics.info/index.php/JLA/issue/view/478)
# Behavior:
#   1) Scrape article titles from the issue page:
#        .obj_article_summary h3.title a[href]
#      and the PDF "view" galley link:
#        .galleys_links a.obj_galley_link.pdf[href]
#   2) Resolve the true PDF download URL by:
#        a) First trying article page <meta name="citation_pdf_url" ...>
#        b) Else opening the "view" URL and grabbing <a class="download" ...>
#   3) Save PDFs to ./JLA_Vol{vol}_Issue{iss}_{year}/<Sanitized Title>.pdf
#   4) Write a CSV log with Title, Article URL, View URL, PDF URL, Filename, Status
#
# Options:
#   --dry-run    : list what would be downloaded (no files saved)
#   --max N      : cap number of downloads
#   --delay S    : pause S seconds between saves
#   --skip-headings "Editorial|Book Review|Book Notes" : regex to skip titles
#
# Requirements: pip install requests beautifulsoup4
#



import re
import csv
import time
import argparse
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Default request headers: a desktop-browser User-Agent string that get()
# copies into every request it sends.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    )
}
# Passed as the `timeout` argument of requests.get() (seconds).
TIMEOUT = 60
# Seconds get() sleeps (time.sleep) after a failed attempt.
RETRY_SLEEP = 2
# Number of attempts get() makes before re-raising the last error.
MAX_RETRIES = 3

def sanitize_filename(name: str) -> str:
    """Return *name* reduced to a string usable as a file name (no extension).

    Steps, in order:
      1. remove the characters ``\\ / * ? : " < > |``;
      2. collapse every whitespace run to a single space and trim the ends;
      3. collapse runs of dots and trim leading/trailing dots and spaces;
      4. cut to at most 180 characters, then trim trailing dots/spaces again,
         because the cut can land right after a space or dot;
      5. fall back to ``"untitled"`` when nothing is left, so callers never
         end up with a bare ``.pdf`` file.

    Args:
        name: Arbitrary text, typically an article title.

    Returns:
        A non-empty string of at most 180 characters.
    """
    name = re.sub(r'[\\/*?:"<>|]', "", name)
    name = re.sub(r"\s+", " ", name).strip()
    name = re.sub(r"\.+", ".", name).strip(". ")
    # Re-trim after truncation: a name ending in " " or "." is rejected or
    # silently altered on Windows.
    name = name[:180].rstrip(". ")
    return name or "untitled"

def get(url: str, referer: str | None = None) -> requests.Response:
    """GET *url* with the module's default headers, retrying on failure.

    A copy of ``HEADERS`` is sent with every attempt; when *referer* is
    truthy it is added as the ``Referer`` header. Each attempt uses
    ``TIMEOUT`` and treats a 4xx/5xx status as a failure
    (``raise_for_status``). Up to ``MAX_RETRIES`` attempts are made (always
    at least one), sleeping ``RETRY_SLEEP`` seconds *between* attempts only,
    so a final failure is reported without an extra pause.

    Args:
        url: Address to fetch.
        referer: Optional value for the ``Referer`` request header.

    Returns:
        The first successful response.

    Raises:
        requests.RequestException: The error from the last failed attempt.
    """
    headers = dict(HEADERS)
    if referer:
        headers["Referer"] = referer

    # Guard against a misconfigured MAX_RETRIES <= 0, which would otherwise
    # skip the loop and leave nothing meaningful to raise.
    attempts = max(1, MAX_RETRIES)
    last_error = None
    for attempt in range(attempts):
        if attempt:
            time.sleep(RETRY_SLEEP)
        try:
            response = requests.get(url, headers=headers, timeout=TIMEOUT)
            response.raise_for_status()
            return response
        except requests.RequestException as exc:
            # Only network/HTTP failures are retried; programming errors
            # propagate immediately.
            last_error = exc
    raise last_error

def soup_for(url: str, referer: str | None = None) -> BeautifulSoup:
    """Fetch *url* through get() and parse the response text as HTML.

    Args:
        url: Page to download.
        referer: Forwarded unchanged to get().

    Returns:
        A BeautifulSoup tree built with the built-in ``html.parser``.
    """
    html = get(url, referer=referer).text
    return BeautifulSoup(html, "html.parser")

def ensure_pdf(resp: requests.Response) -> bool:
    """Tell whether *resp* looks like a PDF.

    The response qualifies when its ``Content-Type`` header contains "pdf"
    (case-insensitively) or, failing that, when the body starts with the
    ``%PDF-`` signature.
    """
    content_type = resp.headers.get("Content-Type")
    if content_type and "pdf" in content_type.lower():
        return True
    # Header missing or not PDF-like: fall back to the file signature.
    return resp.content[:5] == b"%PDF-"

def parse_issue_meta(issue_soup: BeautifulSoup, issue_url: str) -> tuple[str, str, str]:
    """Extract (volume, issue, year) from an issue page for folder naming.

    Looks for text shaped like ``Vol. 7 No. 1 (2020)`` first in the page
    ``<title>`` and then in the first ``<h1>``; the first source that
    matches wins. (Both are tried, so a generic ``<title>`` no longer hides
    a matching ``<h1>``.)

    When neither matches, the placeholders ``"Vol"`` and ``"Issue"`` are
    returned, together with a four-digit year (19xx/20xx) taken from the
    ``.published .value`` element if one is present, else ``"Year"``.

    Args:
        issue_soup: Parsed issue page.
        issue_url: Not used by the parsing; kept for interface compatibility.

    Returns:
        A ``(vol, issue, year)`` tuple of strings.
    """
    candidates = []
    if issue_soup.title:
        candidates.append(issue_soup.title.get_text(" ", strip=True))
    h1 = issue_soup.select_one("h1")
    if h1:
        candidates.append(h1.get_text(" ", strip=True))

    for text in candidates:
        m = re.search(r"Vol\.\s*(\d+)\s+No\.\s*(\d+)\s*\((\d{4})\)", text, flags=re.I)
        if m:
            return m.group(1), m.group(2), m.group(3)

    # Fallback: salvage at least the year from the "Published" date.
    year = ""
    pub = issue_soup.select_one(".published .value")
    if pub:
        m2 = re.search(r"(19|20)\d{2}", pub.get_text(" ", strip=True))
        if m2:
            year = m2.group(0)
    return "Vol", "Issue", year or "Year"

def collect_articles(issue_soup: BeautifulSoup) -> list[dict]:
    """List the articles shown on an issue page.

    Every ``.obj_article_summary`` block that has a title link
    (``h3.title a[href]``) yields one dict with the keys:

    - ``title``: text of the title link;
    - ``article_url``: the title link's ``href``, stripped;
    - ``view_url``: ``href`` of the block's PDF galley link
      (``.galleys_links a.obj_galley_link.pdf[href]``), or ``""`` when the
      block has none.

    Blocks without a title link are ignored. URLs are returned exactly as
    written in the page.
    """

    def _record(summary, title_link) -> dict:
        # The visible "PDF" link usually opens a viewer page rather than the
        # file itself, hence the name "view_url".
        galley = summary.select_one(".galleys_links a.obj_galley_link.pdf[href]")
        return {
            "title": title_link.get_text(" ", strip=True),
            "article_url": title_link.get("href", "").strip(),
            "view_url": galley.get("href").strip() if galley else "",
        }

    summaries = issue_soup.select(".obj_article_summary")
    paired = ((summary, summary.select_one("h3.title a[href]")) for summary in summaries)
    return [_record(summary, link) for summary, link in paired if link]

def find_pdf_url(article_url: str, view_url: str) -> str | None:
    """Resolve the direct PDF download URL for one article.

    Two sources are tried in order, each on a best-effort basis:

      1. the article page's ``<meta name="citation_pdf_url" content="...">``;
      2. the ``<a class="download" href="...">`` link on the galley "view"
         page.

    Whatever is found is resolved against the page it came from with
    ``urljoin``, so a relative ``content``/``href`` value becomes usable
    (absolute values pass through unchanged). An empty *article_url* or
    *view_url* is skipped without making a request.

    Args:
        article_url: URL of the article landing page (also sent as Referer).
        view_url: URL of the PDF viewer page; may be empty.

    Returns:
        The PDF URL, or ``None`` when neither page yields one.
    """
    # 1) Article page meta tag
    if article_url:
        try:
            art = soup_for(article_url, referer=article_url)
            meta = art.select_one('meta[name="citation_pdf_url"][content]')
            cand = meta["content"].strip() if meta else ""
            if cand:
                return urljoin(article_url, cand)
        except Exception:
            # Deliberate best-effort: any failure here just means
            # "try the viewer page instead".
            pass

    # 2) Viewer page -> download link
    if view_url:
        try:
            view = soup_for(view_url, referer=article_url)
            link = view.select_one("a.download[href]")
            href = link["href"].strip() if link else ""
            if href:
                return urljoin(view_url, href)
        except Exception:
            # Deliberate best-effort: the caller treats None as "no PDF".
            pass

    return None

def main():
    """Command-line entry point: download every article PDF of one issue.

    Flow:
      1. Parse arguments (prompting for the issue URL if it was omitted).
      2. Fetch the issue page, derive the output folder name from its
         volume/issue/year, and collect the article entries. Article and
         view links are resolved against the issue URL so relative links
         work.
      3. Drop entries whose title matches ``--skip-headings``.
      4. With ``--dry-run``: print the resolved PDF URL of each entry and
         stop; nothing is created on disk.
      5. Otherwise create the folder, download each PDF (up to ``--max``
         saved files, sleeping ``--delay`` seconds after each save) and
         record every attempt in a CSV log inside the folder.

    Titles that sanitize to the same file name within one run get a
    " (2)", " (3)", ... suffix instead of overwriting each other.
    Fatal setup problems (no URL, bad regex, unreachable issue page) are
    reported with an ``ERROR:`` line and the function returns.
    """
    p = argparse.ArgumentParser(description="Download JoLA PDFs from an issue page (LIVE URL, OJS 3.x)")
    p.add_argument("issue_url", nargs="?", help="Issue URL, e.g., https://learning-analytics.info/index.php/JLA/issue/view/478")
    p.add_argument("--dry-run", action="store_true", help="List targets without downloading")
    p.add_argument("--max", type=int, default=0, help="Download at most N PDFs")
    p.add_argument("--delay", type=float, default=0.0, help="Seconds to sleep between downloads")
    p.add_argument("--skip-headings", default=r"Editorial|Book Review|Book Notes", help="Regex of title fragments to skip")
    args = p.parse_args()

    issue_url = args.issue_url or input("Paste JoLA issue URL: ").strip()
    if not issue_url:
        print("ERROR: No issue URL provided.")
        return

    # Validate the user-supplied regex before doing any network work.
    skip_re = None
    if args.skip_headings:
        try:
            skip_re = re.compile(args.skip_headings, flags=re.I)
        except re.error as e:
            print(f"ERROR: Invalid --skip-headings regex: {e}")
            return

    try:
        issue_soup = soup_for(issue_url)
    except Exception as e:  # top-level boundary: report instead of a traceback
        print(f"ERROR: Could not fetch issue page: {e}")
        return

    vol, iss, year = parse_issue_meta(issue_soup, issue_url)
    outdir = Path(f"JLA_Vol{vol}_Issue{iss}_{year}")
    log_csv = outdir / f"JLA_Vol{vol}_Issue{iss}_{year}_log.csv"

    articles = collect_articles(issue_soup)
    print(f"[INFO] Found {len(articles)} entries on the issue page")

    # Links on the issue page may be relative; make them absolute once here.
    for it in articles:
        for key in ("article_url", "view_url"):
            if it[key]:
                it[key] = urljoin(issue_url, it[key])

    entries = []
    for it in articles:
        title = it["title"]
        if skip_re and skip_re.search(title):
            print(f"[SKIP] {title}")
            continue
        entries.append(it)

    print(f"[INFO] After skipping: {len(entries)} articles")

    # Dry-run: show resolved PDF URLs
    if args.dry_run:
        for i, it in enumerate(entries, 1):
            pdf_url = find_pdf_url(it["article_url"], it["view_url"])
            if pdf_url:
                pdf_url = urljoin(it["article_url"] or issue_url, pdf_url)
            print(f"[{i}] {it['title']}\n    Article: {it['article_url']}\n    View   : {it['view_url'] or '(none)'}\n    PDF    : {pdf_url or '[NO PDF FOUND]'}")
        print("[DRY-RUN] No downloads performed.")
        return

    # Created only now so that a dry run leaves no trace on disk.
    outdir.mkdir(parents=True, exist_ok=True)

    saved = 0
    used_names = set()  # case-folded file names written during this run
    with log_csv.open("w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["Title", "Article URL", "View URL", "PDF URL", "Filename", "Status"])

        for i, it in enumerate(entries, 1):
            if args.max and saved >= args.max:
                break

            title = it["title"]
            article_url = it["article_url"]
            view_url = it["view_url"]

            pdf_url = find_pdf_url(article_url, view_url)
            if not pdf_url:
                w.writerow([title, article_url, view_url, "", "", "Skipped (no PDF)"])
                print(f"[{i}] ⚠️ No PDF: {title}")
                continue

            # Resolve any relative form; absolute URLs pass through unchanged.
            pdf_url = urljoin(article_url or issue_url, pdf_url)

            try:
                resp = get(pdf_url, referer=article_url)
                if not ensure_pdf(resp):
                    w.writerow([title, article_url, view_url, pdf_url, "", "Skipped (not a PDF response)"])
                    print(f"[{i}] ❌ Not a PDF: {title}")
                    continue

                stem = sanitize_filename(title) or "untitled"
                fname = f"{stem}.pdf"
                # Avoid clobbering an earlier article with the same title
                # (compared case-insensitively for Windows/macOS).
                suffix = 2
                while fname.casefold() in used_names:
                    fname = f"{stem} ({suffix}).pdf"
                    suffix += 1
                used_names.add(fname.casefold())

                path = outdir / fname
                path.write_bytes(resp.content)
                w.writerow([title, article_url, view_url, pdf_url, fname, "OK"])
                print(f"[{i}] ✅ Saved: {fname}")
                saved += 1
                if args.delay > 0:
                    time.sleep(args.delay)
            except Exception as e:
                # Per-article boundary: log the failure and keep going.
                w.writerow([title, article_url, view_url, pdf_url, "", f"Error: {e}"])
                print(f"[{i}] ❌ Error: {e}")

    print(f"\nDone! {saved} PDFs saved in {outdir}")
    print(f"Log: {log_csv}")

# Run the downloader only when this file is executed as a script; importing
# it as a module just defines the functions above.
if __name__ == "__main__":
    main()
